This report performs Principal Component Analysis (PCA) on TB samples combined with 1000 Genomes data to visualize population structure and identify significant principal components.
library(openxlsx)
library(ggplot2)
library(ggrepel)
library(scatterplot3d)
library(data.table)
library(reshape)
# Read in the eigenvectors produced in PLINK
pca <- read.table(params$PCA_1KG_EIGENVEC,
header = FALSE, skip = 0, sep = ' ')
rownames(pca) <- pca[,2]
pca <- pca[,3:ncol(pca)]
colnames(pca) <- paste('PC', c(1:20), sep = '')
# Read in the PED data
PED <- read.xlsx(params$SAMPLE_SHEET, sheet = params$SHEET_NUM)
PED <- PED[which(PED$Individual_ID %in% rownames(pca)), ]
PED <- PED[match(rownames(pca), PED$Individual_ID),]
cat("Sample IDs match:", all(PED$Individual_ID == rownames(pca)), "\n")
## Sample IDs match: TRUE
# Merge PCA results with metadata
pca$sample.name = rownames(pca)
plot <- merge(pca, PED, by.x = "sample.name", by.y = "Individual_ID")
# Create super population categories
plot$super.population <- ifelse(plot$Population %in% c("ACB","ASW","ESN","GWD","LWK","MSL","YRI"), "AFR",
ifelse(plot$Population %in% c("CLM","MXL","PEL","PUR"), "AMR",
ifelse(plot$Population %in% c("CDX","CHB","CHS","JPT","KHV"), "EAS",
ifelse(plot$Population %in% c("CEU","FIN","GBR","IBS","TSI"), "EUR",
ifelse(plot$Population %in% c("BEB","GIH","ITU","PJL","STU"), "SAS", "HIRV TB")))))
cat("Super population distribution:\n")
## Super population distribution:
print(table(plot$super.population))
##
## AFR AMR EAS EUR HIRV TB SAS
## 661 347 504 503 267 489
# Calculate percentage variance explained
eigenval <- scan(params$PCA_1KG_EIGENVAL)
percentVar <- data.frame(PC = 1:20, pve = sprintf("%.1f", eigenval/sum(eigenval)*100))
# Set factor levels
plot$super.population = factor(plot$super.population,
levels = c("AFR", "AMR", "EAS", "EUR", "SAS", "HIRV TB"))
# PC1 vs PC2
ggplot(plot, aes(x = PC1, y = PC2, color = super.population)) +
geom_point(alpha = 0.7, size = 2) +
labs(title = "PCA: TB Samples with 1000 Genomes Reference",
x = paste0("PC1 (", percentVar$pve[1], "%)"),
y = paste0("PC2 (", percentVar$pve[2], "%)"),
color = "Super Population") +
theme_minimal() +
theme(legend.position = "bottom")
# PC3 vs PC4
ggplot(plot, aes(x = PC3, y = PC4, color = super.population)) +
geom_point(alpha = 0.7, size = 2) +
labs(title = "PCA: PC3 vs PC4",
x = paste0("PC3 (", percentVar$pve[3], "%)"),
y = paste0("PC4 (", percentVar$pve[4], "%)"),
color = "Super Population") +
theme_minimal() +
theme(legend.position = "bottom")
# 3D PCA for HIRV TB samples only
plot2 = plot[plot$super.population %in% "HIRV TB",]
colors <- c("#999999", "#E69F00", "#56B4E9", "red", "purple")
colors <- colors[as.numeric(as.factor(plot2$Population))]
s3d <- scatterplot3d(plot2$PC1, plot2$PC2, plot2$PC3,
xlab = "PC1", ylab = "PC2", zlab = "PC3",
pch = 16, color = colors, main = "3D PCA: HIRV TB Samples")
legend("right", legend = levels(as.factor(plot2$Population)),
col = c("#999999", "#E69F00", "#56B4E9", "red", "purple"), pch = 16)
# Read in eigenvectors for TB samples only
pca_tb <- read.table(params$PCA_TB_EIGENVEC,
header = FALSE, skip = 0, sep = ' ')
rownames(pca_tb) <- pca_tb[,2]
pca_tb <- pca_tb[,3:ncol(pca_tb)]
colnames(pca_tb) <- paste('PC', c(1:20), sep = '')
# Merge with metadata
pca_tb$sample.name = rownames(pca_tb)
plot_tb <- merge(pca_tb, PED, by.x = "sample.name", by.y = "Individual_ID")
cat("Population distribution in TB samples:\n")
## Population distribution in TB samples:
print(table(plot_tb$Population))
##
## AFR AMR EAS EUR SAS
## 74 9 44 85 55
# Calculate percentage variance explained for TB-only data
eigenval_tb <- scan(params$PCA_TB_EIGENVAL)
percentVar_tb <- data.frame(PC = 1:20, pve = sprintf("%.1f", eigenval_tb/sum(eigenval_tb)*100))
# 2D PCA for TB samples
ggplot(plot_tb, aes(x = PC1, y = PC2, color = Population)) +
geom_point(alpha = 0.7, size = 3) +
labs(title = "PCA: TB Samples Only",
x = paste0("PC1 (", percentVar_tb$pve[1], "%)"),
y = paste0("PC2 (", percentVar_tb$pve[2], "%)")) +
theme_minimal() +
theme(legend.position = "bottom")
# 3D PCA for TB samples
colors_tb <- c("#999999", "#E69F00", "#56B4E9", "red", "purple")
colors_tb <- colors_tb[as.numeric(as.factor(plot_tb$Population))]
s3d_tb <- scatterplot3d(plot_tb$PC1, plot_tb$PC2, plot_tb$PC3,
xlab = "PC1", ylab = "PC2", zlab = "PC3",
pch = 16, color = colors_tb,
main = "3D PCA: All TB Samples")
legend("right", legend = levels(as.factor(plot_tb$Population)),
col = c("#999999", "#E69F00", "#56B4E9", "red", "purple"), pch = 16)
# Test each PC for association with population
dat2 = list()
t = colnames(plot_tb[,2:21]) # 20 PCs
for (i in t) {
formula <- as.formula(paste(i, "~ Population"))
fit = lm(formula, data = plot_tb)
dat2[[i]] = data.frame(
beta.AMR = coef(fit)[2],
beta.EAS = coef(fit)[3],
beta.EUR = coef(fit)[4],
beta.SAS = coef(fit)[5],
p.valueAMR = coef(summary(fit))[2,4],
p.valueEAS = coef(summary(fit))[3,4],
p.valueEUR = coef(summary(fit))[4,4],
p.valueASA = coef(summary(fit))[5,4],
PC = gsub("PC", "", i)
)
}
dat2 = as.data.frame(do.call(rbind, dat2))
# Adjust p-values for multiple testing
dat2$adjp.AMR = p.adjust(dat2$p.valueAMR, method = "fdr")
dat2$adjp.EAS = p.adjust(dat2$p.valueEAS, method = "fdr")
dat2$adjp.EUR = p.adjust(dat2$p.valueEUR, method = "fdr")
dat2$adjp.ASA = p.adjust(dat2$p.valueASA, method = "fdr")
cat("Significant PC associations found:\n")
## Significant PC associations found:
print(dat2[dat2$adjp.AMR < 0.05 | dat2$adjp.EAS < 0.05 |
dat2$adjp.EUR < 0.05 | dat2$adjp.ASA < 0.05, ])
## beta.AMR beta.EAS beta.EUR beta.SAS p.valueAMR
## PC1 0.11716604 0.132404184 0.129304677 0.126388815 1.390430e-40
## PC2 -0.01118028 0.124866014 -0.056439989 0.002814405 1.491730e-02
## PC3 0.02558498 0.038908064 0.034978540 -0.117702057 1.485887e-05
## PC4 -0.02570894 -0.022955059 -0.044002112 -0.024210739 2.210186e-01
## PC5 0.23104468 -0.015369855 -0.017417395 -0.002417023 5.574076e-38
## PC6 0.08191100 -0.001768767 0.003845047 -0.009570937 1.290690e-04
## PC11 -0.05598031 -0.004919880 -0.003174231 -0.006628056 9.841831e-03
## p.valueEAS p.valueEUR p.valueASA PC adjp.AMR adjp.EAS
## PC1 1.216213e-96 1.760596e-111 1.528568e-98 1 2.780859e-39 1.216213e-95
## PC2 1.474602e-137 4.195669e-79 2.223446e-01 2 4.972435e-02 2.949204e-136
## PC3 2.976649e-28 1.564692e-31 3.248112e-114 3 9.905913e-05 1.984432e-27
## PC4 4.324417e-02 4.992231e-06 2.277056e-02 4 2.957391e-01 2.162208e-01
## PC5 6.122508e-02 1.131540e-02 7.521601e-01 5 5.574076e-37 2.449003e-01
## PC6 8.764553e-01 6.857693e-01 3.687293e-01 6 6.453448e-04 9.770053e-01
## PC11 6.720650e-01 7.436359e-01 5.420597e-01 11 3.936732e-02 9.770053e-01
## adjp.EUR adjp.ASA
## PC1 3.521192e-110 1.528568e-97
## PC2 4.195669e-78 9.512302e-01
## PC3 1.043128e-30 6.496223e-113
## PC4 2.496116e-05 1.518038e-01
## PC5 4.526159e-02 9.512302e-01
## PC6 9.738379e-01 9.512302e-01
## PC11 9.738379e-01 9.512302e-01
# Prepare data for visualization
df = dat2[,c("adjp.AMR", "adjp.EAS", "adjp.EUR", "adjp.ASA", "PC")]
df = melt(df, id = "PC")
# Create bar plot of adjusted p-values
ggplot(df, aes(reorder(PC, as.numeric(PC)), -log10(value), fill = variable)) +
geom_bar(stat = "identity", position = position_dodge()) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "Principal Component",
y = "-Log10(Adjusted P-value)",
title = "Association of PCs with Population Structure",
fill = "Population") +
geom_hline(yintercept = -log10(0.05), linetype = "dotted", color = "red")
cat("### Analysis Summary\n")
## ### Analysis Summary
cat("- Total samples in 1KG + TB analysis:", nrow(plot), "\n")
## - Total samples in 1KG + TB analysis: 2771
cat("- Total samples in TB-only analysis:", nrow(plot_tb), "\n")
## - Total samples in TB-only analysis: 267
cat("- Number of significant PC-population associations:",
sum(dat2$adjp.EAS < 0.05 | dat2$adjp.EUR < 0.05 | dat2$adjp.ASA < 0.05), "\n")
## - Number of significant PC-population associations: 5
#cat("- PCs with ≥2 adj-p < 0.05:",
# sum(rowSums(dat2[, c("adjp.AMR","adjp.EAS","adjp.EUR","adjp.ASA")] < 0.05, na.rm = TRUE) >= 2), "\n")
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/London
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape_0.8.9 data.table_1.16.2 scatterplot3d_0.3-44
## [4] ggrepel_0.9.6 ggplot2_3.5.1 openxlsx_4.2.8
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_1.8.7 highr_0.10 dplyr_1.1.4
## [5] compiler_4.3.2 tidyselect_1.2.1 Rcpp_1.0.13-1 zip_2.3.0
## [9] jquerylib_0.1.4 scales_1.3.0 yaml_2.3.7 fastmap_1.1.1
## [13] plyr_1.8.9 R6_2.5.1 labeling_0.4.3 generics_0.1.3
## [17] knitr_1.45 tibble_3.2.1 munsell_0.5.1 bslib_0.5.1
## [21] pillar_1.9.0 rlang_1.1.4 utf8_1.2.4 cachem_1.0.8
## [25] stringi_1.8.4 xfun_0.41 sass_0.4.7 cli_3.6.3
## [29] withr_3.0.2 magrittr_2.0.3 digest_0.6.33 grid_4.3.2
## [33] lifecycle_1.0.4 vctrs_0.6.5 evaluate_0.23 glue_1.8.0
## [37] farver_2.1.2 fansi_1.0.6 colorspace_2.1-1 rmarkdown_2.25
## [41] tools_4.3.2 pkgconfig_2.0.3 htmltools_0.5.7